data_folder <- "Data/Bike-Sharing-Dataset"
files <- list.files(data_folder, full.names = T)
files
[1] "Data/Bike-Sharing-Dataset/day.csv"
[2] "Data/Bike-Sharing-Dataset/hour.csv"
[3] "Data/Bike-Sharing-Dataset/Readme.txt"
day <- read_csv(files[[1]])
hour <- read_csv(files[[2]])
# A tibble: 731 x 16
instant dteday season yr mnth holiday weekday workingday
<dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2011-01-01 1 0 1 0 6 0
2 2 2011-01-02 1 0 1 0 0 0
3 3 2011-01-03 1 0 1 0 1 1
4 4 2011-01-04 1 0 1 0 2 1
5 5 2011-01-05 1 0 1 0 3 1
6 6 2011-01-06 1 0 1 0 4 1
7 7 2011-01-07 1 0 1 0 5 1
8 8 2011-01-08 1 0 1 0 6 0
9 9 2011-01-09 1 0 1 0 0 0
10 10 2011-01-10 1 0 1 0 1 1
# … with 721 more rows, and 8 more variables: weathersit <dbl>,
# temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>, casual <dbl>,
# registered <dbl>, cnt <dbl>
Data summary
| Name |
Piped data |
| Number of rows |
731 |
| Number of columns |
16 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| numeric |
15 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| dteday |
0 |
1 |
2011-01-01 |
2012-12-31 |
2012-01-01 |
731 |
Variable type: numeric
| instant |
0 |
1 |
366.00 |
211.17 |
1.00 |
183.50 |
366.00 |
548.50 |
731.00 |
▇▇▇▇▇ |
| season |
0 |
1 |
2.50 |
1.11 |
1.00 |
2.00 |
3.00 |
3.00 |
4.00 |
▇▇▁▇▇ |
| yr |
0 |
1 |
0.50 |
0.50 |
0.00 |
0.00 |
1.00 |
1.00 |
1.00 |
▇▁▁▁▇ |
| mnth |
0 |
1 |
6.52 |
3.45 |
1.00 |
4.00 |
7.00 |
10.00 |
12.00 |
▇▅▅▅▇ |
| holiday |
0 |
1 |
0.03 |
0.17 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| weekday |
0 |
1 |
3.00 |
2.00 |
0.00 |
1.00 |
3.00 |
5.00 |
6.00 |
▇▃▃▃▇ |
| workingday |
0 |
1 |
0.68 |
0.47 |
0.00 |
0.00 |
1.00 |
1.00 |
1.00 |
▃▁▁▁▇ |
| weathersit |
0 |
1 |
1.40 |
0.54 |
1.00 |
1.00 |
1.00 |
2.00 |
3.00 |
▇▁▅▁▁ |
| temp |
0 |
1 |
0.50 |
0.18 |
0.06 |
0.34 |
0.50 |
0.66 |
0.86 |
▂▇▇▇▅ |
| atemp |
0 |
1 |
0.47 |
0.16 |
0.08 |
0.34 |
0.49 |
0.61 |
0.84 |
▂▇▆▇▂ |
| hum |
0 |
1 |
0.63 |
0.14 |
0.00 |
0.52 |
0.63 |
0.73 |
0.97 |
▁▁▆▇▂ |
| windspeed |
0 |
1 |
0.19 |
0.08 |
0.02 |
0.13 |
0.18 |
0.23 |
0.51 |
▃▇▅▁▁ |
| casual |
0 |
1 |
848.18 |
686.62 |
2.00 |
315.50 |
713.00 |
1096.00 |
3410.00 |
▇▆▂▁▁ |
| registered |
0 |
1 |
3656.17 |
1560.26 |
20.00 |
2497.00 |
3662.00 |
4776.50 |
6946.00 |
▂▅▇▅▃ |
| cnt |
0 |
1 |
4504.35 |
1937.21 |
22.00 |
3152.00 |
4548.00 |
5956.00 |
8714.00 |
▂▅▇▅▃ |
day %>% ggplot() + geom_point(aes(x = atemp, y = cnt, color = factor(season))) +
theme_bw()

cats <- day %>% map(~length(unique(.))) %>% cbind %>% data.frame() %>% rownames_to_column() %>%
unnest %>% setNames(nm = c("var", "n"))
vars_implicit_catgory <- cats$var[cats$n < 5]
vars_implicit_catgory
[1] "season" "yr" "holiday" "workingday" "weathersit"
lapply(vars_implicit_catgory, function(v) {
day %>% mutate(`:=`(!!sym(v), factor(!!sym(v)))) %>% ggplot() + geom_violin(aes_string(x = v,
y = "cnt", color = v), fill = "gray80", alpha = 0.1) + geom_jitter(aes_string(x = v,
y = "cnt", color = v), alpha = 0.2, width = 0.1) + theme_light()
})
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

day <- day %>% mutate(season = factor(season), holiday = factor(holiday), workingday = factor(workingday),
weathersit = factor(weathersit)) %>% mutate(days_since_2011 = lubridate::interval(lubridate::ymd("2011-01-01"),
dteday)/lubridate::ddays(1))
set.seed(123)
ind_tr <- (1:nrow(day))[sample(x = c(T, F), size = nrow(day), prob = c(0.7,
0.3), replace = T)]
ind_te <- setdiff(1:nrow(day), ind_tr)
day_m_tr <- day[ind_tr, ] %>% mutate(split = "train")
day_m_te <- day[ind_te, ] %>% mutate(split = "test")
unused_cols <- c("casual", "registered", "instant", "dteday", "yr", "mnth",
"weekday", "split")
lm_01 <- model <- step(lm(formula = cnt ~ ., data = day_m_tr %>% select(-one_of(unused_cols))),
direction = "both")
Start: AIC=6886.74
cnt ~ season + holiday + workingday + weathersit + temp + atemp +
hum + windspeed + days_since_2011
Df Sum of Sq RSS AIC
- atemp 1 966 354721851 6884.7
<none> 354720885 6886.7
- workingday 1 2070471 356791356 6887.7
- holiday 1 6919410 361640294 6894.6
- temp 1 9353511 364074395 6898.0
- hum 1 18451116 373172001 6910.6
- windspeed 1 26023229 380744114 6920.8
- weathersit 2 34273585 388994470 6929.8
- season 3 58546210 413267095 6958.6
- days_since_2011 1 435762280 790483165 7293.4
Step: AIC=6884.74
cnt ~ season + holiday + workingday + weathersit + temp + hum +
windspeed + days_since_2011
Df Sum of Sq RSS AIC
<none> 354721851 6884.7
- workingday 1 2071470 356793321 6885.7
+ atemp 1 966 354720885 6886.7
- holiday 1 6921957 361643808 6892.6
- hum 1 18510236 373232087 6908.7
- windspeed 1 26960785 381682636 6920.1
- weathersit 2 34356488 389078339 6927.9
- season 3 59877458 414599309 6958.3
- temp 1 157523827 512245678 7070.2
- days_since_2011 1 435876229 790598080 7291.5
Call:
lm(formula = cnt ~ season + holiday + workingday + weathersit +
temp + hum + windspeed + days_since_2011, data = day_m_tr %>%
select(-one_of(unused_cols)))
Coefficients:
(Intercept) season2 season3 season4
1644.81 762.63 -129.44 171.63
holiday1 workingday1 weathersit2 weathersit3
-717.59 145.13 -347.78 -2015.12
temp hum windspeed days_since_2011
5517.46 -1897.10 -3227.30 4.95
lm_results <- list(day_m_tr, day_m_te) %>% bind_rows %>% group_by(split) %>%
nest %>% mutate(pred = map(data, ~predict(object = lm_01, newdata = .))) %>%
mutate(data = map2(data, pred, ~bind_cols(data, pred))) %>% select(data) %>%
unnest %>% ungroup
lm_results %>% ggplot() + geom_point(aes(x = cnt, y = V1, color = split, shape = split),
size = 3, alpha = 0.3) + theme_bw() + scale_color_aaas() + theme(axis.text.x = element_text(size = 12),
axis.title.x = element_text(size = 14), axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 14), legend.background = element_rect(color = "gray70"),
legend.position = c(0, 1), legend.justification = c(0, 1), legend.text = element_text(size = 12),
legend.title = element_text(size = 14)) + ylab("predicted") + xlab("count")

library(randomForest)
rf_01 <- model <- randomForest(cnt ~ ., data = day_m_tr %>% select(-one_of(unused_cols)),
importance = TRUE)
tr_new <- data.frame(cnt = day_m_tr$cnt, model.matrix(cnt ~ ., data = day_m_tr %>%
select(-one_of(unused_cols))) %>% data.frame %>% select(-1)) %>% add_column(split = "train",
.before = 1)
te_new <- data.frame(cnt = day_m_te$cnt, model.matrix(cnt ~ ., data = day_m_te %>%
select(-one_of(unused_cols))) %>% data.frame %>% select(-1)) %>% add_column(split = "test",
.before = 1)
rf_02 <- randomForest(cnt ~ ., data = tr_new %>% select(-split), importance = T)
rf_results_01 <- bind_rows(day_m_tr, day_m_te) %>% group_by(split) %>% nest %>%
mutate(pred = map(data, ~predict(rf_01, newdata = .))) %>% mutate(data = map2(data,
pred, ~bind_cols(data, pred))) %>% select(data) %>% unnest %>% ungroup
rf_results_02 <- bind_rows(tr_new, te_new) %>% group_by(split) %>% nest %>%
mutate(pred = map(data, ~predict(rf_02, newdata = .))) %>% mutate(data = map2(data,
pred, ~bind_cols(data, pred))) %>% select(data) %>% unnest %>% ungroup
rmse_rf_01 <- rf_results_01 %>% group_by(split) %>% summarize(a = sqrt(mean((V1 -
cnt)^2)))
rmse_rf_01
# A tibble: 2 x 2
split a
<chr> <dbl>
1 test 663.
2 train 321.
rmse_rf_02 <- rf_results_02 %>% group_by(split) %>% summarize(a = sqrt(mean((V1 -
cnt)^2)))
rmse_rf_02
# A tibble: 2 x 2
split a
<chr> <dbl>
1 test 670.
2 train 330.
rmse_lm <- lm_results %>% group_by(split) %>% summarize(a = sqrt(mean((V1 -
cnt)^2)))
rmse_lm
# A tibble: 2 x 2
split a
<chr> <dbl>
1 test 993.
2 train 834.
rf_results_01 %>% ggplot() + geom_point(aes(x = cnt, y = V1, color = split,
fill = split, shape = split), size = 3, alpha = 0.3) + theme_bw() + scale_color_aaas() +
theme(legend.position = c(0, 1), legend.justification = c(0, 1), legend.background = element_rect(color = "gray70"),
legend.text = element_text(size = 12), legend.title = element_text(size = 14),
axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 14),
axis.text.y = element_text(s = 12), axis.title.y = element_text(s = 14)) +
ylab("predicted") + xlab("y")

rf_imp <- sort(randomForest::importance(rf_01, type = 1, scale = FALSE)[, 1],
decreasing = T)
rf_imp %>% data.frame(importance = .) %>% rownames_to_column("feature") %>%
ggplot() + geom_bar(aes(x = factor(feature, levels = names(rf_imp)), y = importance),
stat = "identity", fill = "skyblue3", color = "skyblue4") + theme_bw() +
theme(axis.title.x = element_text(size = 14), axis.text.x = element_text(size = 12,
angle = -90), axis.title.y = element_text(size = 14), axis.text.y = element_text(size = 12)) +
xlab("factor")

library(pdp)
ice_rf <- partial(object = rf_01, pred.var = "temp", ice = TRUE)
plotPartial(ice_rf)

Customized plot function
plotICE <- function(model, feature, center) {
obj <- partial(object = model, pred.var = feature, ice = TRUE, center = center)
mean_obj <- obj %>% group_by(!!sym(feature)) %>% summarize(yhat = mean(yhat))
ggplot(obj) + geom_line(aes(x = !!sym(feature), y = yhat, group = yhat.id),
alpha = 0.2) + geom_line(aes(x = !!sym(feature), y = yhat), color = "dodgerblue",
size = 1.2, data = mean_obj) + theme_bw() + theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
}
plotPDP <- function(model, feature, target_var, data) {
obj <- partial(object = model, pred.var = feature, ice = F)
mean_y <- data[[target_var]] %>% mean
lowess_df <- lowess(x = day_m_tr[[feature]], y = day_m_tr[[target_var]]) %>%
data.frame()
ggplot() + geom_line(aes(x = !!sym(feature), y = yhat), size = 1.2, color = "dodgerblue",
data = obj) + geom_point(aes(x = !!sym(feature), y = !!sym(target_var)),
alpha = 0.2, data = data) + geom_line(aes(x = x, y = y), data = lowess_df,
color = "darkorange", size = 1.2) + geom_abline(intercept = mean_y,
slope = 0, color = "gray60", linetype = 2) + theme_bw() + theme(axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12))
}
p1 <- plotICE(rf_01, "days_since_2011", F)
p2 <- plotICE(rf_01, "temp", F)
p3 <- plotICE(rf_01, "atemp", F)
p4 <- plotICE(rf_01, "hum", F)
gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)

p5 <- plotICE(rf_01, "days_since_2011", T)
p6 <- plotICE(rf_01, "temp", T)
p7 <- plotICE(rf_01, "atemp", T)
p8 <- plotICE(rf_01, "hum", T)
gridExtra::grid.arrange(p5, p6, p7, p8, nrow = 2)

p1 <- plotPDP(rf_01, "days_since_2011", "cnt", day_m_tr)
p2 <- plotPDP(rf_01, "temp", "cnt", day_m_tr)
p3 <- plotPDP(rf_01, "atemp", "cnt", day_m_tr)
p4 <- plotPDP(rf_01, "hum", "cnt", day_m_tr)
gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)

pdp2 <- partial(object = rf_01, pred.var = c("days_since_2011", "temp"), chull = T)
plotPartial(pdp2)

plotPartial(pdp2, levelplot = F, drape = T, screen = list(z = -20, x = -70))
